In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# TREES
# Import necessary libraries
! pip install pandas numpy scikit-learn matplotlib ISLP
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
from ISLP import load_data
# Load the Auto dataset from package ISLP
Auto = load_data('Auto')
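In [ ]:
# A quick sanity check of the data before modeling (not part of the original analysis):
# the ISLP version of Auto has 392 rows with columns mpg, cylinders, displacement,
# horsepower, weight, acceleration, year, origin, and name.
print(Auto.shape)
print(Auto.columns.tolist())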
In [42]:
# 1. Classification trees
# Define a categorical variable ECO
ECO = np.where(Auto['mpg'] > Auto['mpg'].median(), "Economy", "Consuming")
Cars = Auto.assign(ECO=ECO) # Include ECO into the data set
# Check class distribution
print(Cars['ECO'].value_counts())
ECO
Consuming    196
Economy      196
Name: count, dtype: int64
In [44]:
# Build initial tree to classify ECO based on all predictors (excluding 'name' if it exists in the data)
X = Cars.drop(columns=['ECO', 'name'], errors='ignore')
y = Cars['ECO']
clf = DecisionTreeClassifier()
clf.fit(X, y)
# Display the tree structure
plot_tree(clf, feature_names=X.columns, class_names=clf.classes_, filled=True)
Out[44]:
[Text(0.5, 0.75, 'mpg <= 22.75\ngini = 0.5\nsamples = 392\nvalue = [196, 196]\nclass = Consuming'),
 Text(0.25, 0.25, 'gini = 0.0\nsamples = 196\nvalue = [196, 0]\nclass = Consuming'),
 Text(0.75, 0.25, 'gini = 0.0\nsamples = 196\nvalue = [0, 196]\nclass = Economy')]
In [48]:
# Of course, classifying ECO based on mpg is trivial: the tree picks this obvious split immediately.
# So we exclude mpg and predict ECO from the car's other technical characteristics instead.
X_refined = Cars[['horsepower', 'weight', 'acceleration']]
clf_refined = DecisionTreeClassifier()
clf_refined.fit(X_refined, y)
# Display refined tree
plot_tree(clf_refined, feature_names=X_refined.columns, class_names=clf_refined.classes_, filled=True)
plt.show()
In [52]:
# This is too small to read. We can increase the figure size:
plt.figure(figsize=(30, 20)) # Adjust width and height as needed
plot_tree(clf_refined, feature_names=X_refined.columns, class_names=clf_refined.classes_, filled=True)
plt.show()
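In [ ]:
# An optional check (not in the original notebook): feature_importances_ reports each
# predictor's share of the total impurity reduction, showing which variable drives the splits.
for name, importance in zip(X_refined.columns, clf_refined.feature_importances_):
    print(f"{name}: {importance:.3f}")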
In [54]:
# Model Summary
print("Number of terminal nodes:", clf_refined.get_n_leaves())
print("Tree depth:", clf_refined.get_depth())
Number of terminal nodes: 51
Tree depth: 11
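In [ ]:
# A plain-text view of the splits can be easier to read than a large plot.
# An optional alternative using sklearn's export_text (not in the original notebook);
# max_depth truncates the printout to the first three levels.
from sklearn.tree import export_text
print(export_text(clf_refined, feature_names=list(X_refined.columns), max_depth=3))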
In [56]:
# Estimate the classification rate on a held-out test set (50/50 train/test split)
train_data, test_data, train_labels, test_labels = train_test_split(X_refined, y, test_size=0.5, random_state=42)
clf_val = DecisionTreeClassifier()
clf_val.fit(train_data, train_labels)
# Predictions and confusion matrix
pred_labels = clf_val.predict(test_data)
conf_matrix = confusion_matrix(test_labels, pred_labels)
print("Confusion matrix:\n", conf_matrix)
print("Classification accuracy:", accuracy_score(test_labels, pred_labels))
Confusion matrix:
 [[75 25]
 [ 6 90]]
Classification accuracy: 0.8418367346938775
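In [ ]:
# The estimate above comes from a single random split; k-fold cross-validation averages
# over several splits and is more stable. A minimal sketch with the default 5 folds:
cv_scores = cross_val_score(DecisionTreeClassifier(random_state=42), X_refined, y, cv=5)
print("5-fold CV accuracy:", round(cv_scores.mean(), 3), "+/-", round(cv_scores.std(), 3))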
In [58]:
# Pruning the tree by cross-validation to find optimal size
path = clf_refined.cost_complexity_pruning_path(X_refined, y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# Compute the cross-validated classification rate for each alpha on the pruning path
scores = [cross_val_score(DecisionTreeClassifier(ccp_alpha=alpha), X_refined, y, cv=5).mean() for alpha in ccp_alphas]
plt.figure()
plt.plot(ccp_alphas, impurities, marker='o', label="Total impurity (deviance)")
plt.plot(ccp_alphas, scores, marker='x', label="Classification rate")
plt.xlabel("Effective alpha")
plt.ylabel("Total impurity of leaves / Classification rate")
plt.legend()
plt.show()
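In [ ]:
# The largest alpha on the pruning path collapses the tree to the root node, so its
# cross-validated accuracy is just the majority-class rate. A small optional refinement
# (not in the original notebook) is to exclude it before picking the best alpha:
candidate_alphas, candidate_scores = ccp_alphas[:-1], scores[:-1]
print("Best alpha among non-trivial trees:", candidate_alphas[np.argmax(candidate_scores)])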
In [78]:
# Prune the tree at the alpha with the highest cross-validated accuracy
# (equivalently, the smallest misclassification error)
optimal_alpha = ccp_alphas[np.argmax(scores)]
pruned_clf = DecisionTreeClassifier(ccp_alpha=optimal_alpha)
pruned_clf.fit(X_refined, y)
# Display pruned tree
plot_tree(pruned_clf, feature_names=X_refined.columns, class_names=pruned_clf.classes_, filled=True)
plt.show()
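In [ ]:
# Summary of the pruned tree, mirroring the summary printed for the unpruned tree above,
# plus its cross-validated accuracy (cross_val_score refits a clone on each fold):
print("Number of terminal nodes:", pruned_clf.get_n_leaves())
print("Tree depth:", pruned_clf.get_depth())
print("5-fold CV accuracy:", round(cross_val_score(pruned_clf, X_refined, y, cv=5).mean(), 3))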
In [72]:
# 2. Regression Trees
from sklearn.tree import DecisionTreeRegressor
# Define the target and features for regression tree
X_reg = (Cars.drop(columns=['mpg', 'name', 'origin', 'ECO'], errors='ignore')
             .assign(origin=Cars['origin'].astype('category').cat.codes))
y_reg = Cars['mpg']
reg_tree = DecisionTreeRegressor()
reg_tree.fit(X_reg, y_reg)
Out[72]:
DecisionTreeRegressor()
In [86]:
# Display regression tree
plt.figure(figsize=(50, 40)) # Set figure size
plot_tree(
reg_tree,
feature_names=X_reg.columns,
filled=True,
fontsize=10 # Adjust this as needed for readability
)
plt.show()
# Summary of the regression tree (residual mean deviance here is the training MSE)
print("Number of terminal nodes:", reg_tree.get_n_leaves())
print("Tree depth:", reg_tree.get_depth())
print("Residual mean deviance:", np.mean((y_reg - reg_tree.predict(X_reg))**2))
Number of terminal nodes: 321
Tree depth: 17
Residual mean deviance: 0.0
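In [ ]:
# A residual mean deviance of 0 means the fully grown tree memorizes the training data,
# so it says nothing about predictive accuracy. A minimal sketch of an honest estimate
# on a held-out set, with ccp_alpha=1.0 as an illustrative (untuned) pruning value:
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.5, random_state=42)
reg_pruned = DecisionTreeRegressor(ccp_alpha=1.0)
reg_pruned.fit(Xr_train, yr_train)
print("Held-out test MSE:", round(np.mean((yr_test - reg_pruned.predict(Xr_test))**2), 2))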
In [ ]: